(df_a <-
read_delim("data/apr13.txt", delim = "\t") %>%
rename(name_suffix = name_sufx_cd) %>%
mutate_if(is.character, str_squish) %>%
mutate(id_a = row_number()))## # A tibble: 16,271 x 71
## county_id county_desc voter_reg_num status_cd voter_status_desc
## <int> <chr> <chr> <chr> <chr>
## 1 100 YANCEY 000000014683 A ACTIVE
## 2 100 YANCEY 000000014722 A ACTIVE
## 3 100 YANCEY 000000010135 A ACTIVE
## 4 100 YANCEY 000000022654 A ACTIVE
## 5 100 YANCEY 000000020123 R REMOVED
## 6 100 YANCEY 000000017105 R REMOVED
## 7 100 YANCEY 000000014537 A ACTIVE
## 8 100 YANCEY 000000019234 A ACTIVE
## 9 100 YANCEY 000000002043 A ACTIVE
## 10 100 YANCEY 000000019138 A ACTIVE
## # ... with 16,261 more rows, and 66 more variables: reason_cd <chr>,
## # voter_status_reason_desc <chr>, absent_ind <chr>, name_prefx_cd <chr>,
## # last_name <chr>, first_name <chr>, midl_name <chr>, name_suffix <chr>,
## # res_street_address <chr>, res_city_desc <chr>, state_cd <chr>,
## # zip_code <int>, mail_addr1 <chr>, mail_addr2 <chr>, mail_addr3 <chr>,
## # mail_addr4 <chr>, mail_city <chr>, mail_state <chr>,
## # mail_zipcode <int>, full_phone_number <chr>, race_code <chr>,
## # ethnic_code <chr>, party_cd <chr>, gender_code <chr>, birth_age <int>,
## # birth_place <chr>, registr_dt <chr>, precinct_abbrv <chr>,
## # precinct_desc <chr>, municipality_abbrv <chr>,
## # municipality_desc <chr>, ward_abbrv <chr>, ward_desc <chr>,
## # cong_dist_abbrv <chr>, super_court_abbrv <chr>,
## # judic_dist_abbrv <chr>, nc_senate_abbrv <chr>, nc_house_abbrv <chr>,
## # county_commiss_abbrv <chr>, county_commiss_desc <chr>,
## # township_abbrv <chr>, township_desc <chr>, school_dist_abbrv <chr>,
## # school_dist_desc <chr>, fire_dist_abbrv <chr>, fire_dist_desc <chr>,
## # water_dist_abbrv <chr>, water_dist_desc <chr>, sewer_dist_abbrv <chr>,
## # sewer_dist_desc <chr>, sanit_dist_abbrv <chr>, sanit_dist_desc <chr>,
## # rescue_dist_abbrv <chr>, rescue_dist_desc <chr>,
## # munic_dist_abbrv <chr>, munic_dist_desc <chr>, dist_1_abbrv <chr>,
## # dist_1_desc <chr>, dist_2_abbrv <chr>, dist_2_desc <chr>,
## # confidential_ind <chr>, age <chr>, ncid <chr>, vtd_abbrv <chr>,
## # vtd_desc <chr>, id_a <int>
(df_b <-
read_delim("data/mar17.txt", delim = "\t") %>%
rename(name_suffix = name_suffix_lbl) %>%
mutate_if(is.character, str_squish) %>%
mutate(id_b = row_number()))## # A tibble: 16,276 x 72
## county_id county_desc voter_reg_num status_cd voter_status_desc
## <int> <chr> <chr> <chr> <chr>
## 1 100 YANCEY 000000026581 A ACTIVE
## 2 100 YANCEY 000000014683 A ACTIVE
## 3 100 YANCEY 000000014722 A ACTIVE
## 4 100 YANCEY 000000010135 A ACTIVE
## 5 100 YANCEY 000000025582 A ACTIVE
## 6 100 YANCEY 000000022654 I INACTIVE
## 7 100 YANCEY 000000020123 R REMOVED
## 8 100 YANCEY 000000026587 A ACTIVE
## 9 100 YANCEY 000000017105 R REMOVED
## 10 100 YANCEY 000000026924 A ACTIVE
## # ... with 16,266 more rows, and 67 more variables: reason_cd <chr>,
## # voter_status_reason_desc <chr>, absent_ind <chr>, name_prefx_cd <chr>,
## # last_name <chr>, first_name <chr>, middle_name <chr>,
## # name_suffix <chr>, res_street_address <chr>, res_city_desc <chr>,
## # state_cd <chr>, zip_code <int>, mail_addr1 <chr>, mail_addr2 <chr>,
## # mail_addr3 <chr>, mail_addr4 <chr>, mail_city <chr>, mail_state <chr>,
## # mail_zipcode <chr>, full_phone_number <dbl>, race_code <chr>,
## # ethnic_code <chr>, party_cd <chr>, gender_code <chr>, birth_age <int>,
## # birth_state <chr>, drivers_lic <chr>, registr_dt <chr>,
## # precinct_abbrv <chr>, precinct_desc <chr>, municipality_abbrv <chr>,
## # municipality_desc <chr>, ward_abbrv <chr>, ward_desc <chr>,
## # cong_dist_abbrv <int>, super_court_abbrv <int>,
## # judic_dist_abbrv <int>, nc_senate_abbrv <int>, nc_house_abbrv <int>,
## # county_commiss_abbrv <chr>, county_commiss_desc <chr>,
## # township_abbrv <chr>, township_desc <chr>, school_dist_abbrv <chr>,
## # school_dist_desc <chr>, fire_dist_abbrv <chr>, fire_dist_desc <chr>,
## # water_dist_abbrv <chr>, water_dist_desc <chr>, sewer_dist_abbrv <chr>,
## # sewer_dist_desc <chr>, sanit_dist_abbrv <chr>, sanit_dist_desc <chr>,
## # rescue_dist_abbrv <chr>, rescue_dist_desc <chr>,
## # munic_dist_abbrv <chr>, munic_dist_desc <chr>, dist_1_abbrv <int>,
## # dist_1_desc <chr>, dist_2_abbrv <chr>, dist_2_desc <chr>,
## # confidential_ind <chr>, age <chr>, ncid <chr>, vtd_abbrv <chr>,
## # vtd_desc <chr>, id_b <int>
## # A tibble: 16,271 x 10
## id_a fname lname birth_age gender_code race_code voter_reg_num
## <int> <chr> <chr> <int> <chr> <chr> <chr>
## 1 1 KAREN ABERNATHY 62 F W 000000014683
## 2 2 TRAVIS ABERNATHY 70 M W 000000014722
## 3 3 RONALD ABRAHAM 63 M W 000000010135
## 4 4 DARCI ABRAMS 33 F W 000000022654
## 5 5 LYDIA ABRAMS 44 F W 000000020123
## 6 6 JOSHUA ACUFF 27 M W 000000017105
## 7 7 BARBARA ADAMS 65 F W 000000014537
## 8 8 CAROLYN ADAMS 63 F W 000000019234
## 9 9 DAVID ADAMS 57 M W 000000002043
## 10 10 JESSICA ADAMS 34 F W 000000019138
## # ... with 16,261 more rows, and 3 more variables: birth_year <dbl>,
## # ffreq <dbl>, lfreq <dbl>
## # A tibble: 16,276 x 10
## id_b fname lname birth_age gender_code race_code voter_reg_num
## <int> <chr> <chr> <int> <chr> <chr> <chr>
## 1 1 PAMELA ABARE 49 F W 000000026581
## 2 2 KAREN ABERNATHY 66 F W 000000014683
## 3 3 TRAVIS ABERNATHY 73 M W 000000014722
## 4 4 RONALD ABRAHAM 67 M W 000000010135
## 5 5 BRIAN ABRAM 36 M W 000000025582
## 6 6 DARCI ABRAMS 36 F W 000000022654
## 7 7 LYDIA ABRAMS 48 F W 000000020123
## 8 8 MALINDA ACHARD 43 F W 000000026587
## 9 9 JOSHUA ACUFF 31 M W 000000017105
## 10 10 BARBARA ADAMS 78 F W 000000026924
## # ... with 16,266 more rows, and 3 more variables: birth_year <dbl>,
## # ffreq <dbl>, lfreq <dbl>
(df_exact_matches <-
df_a_mod %>%
select(id_a, fname, lname, birth_year, gender_code, race_code) %>%
inner_join(df_b_mod %>%
select(id_b, fname, lname, birth_year, gender_code, race_code)) %>%
select(starts_with("id")))## # A tibble: 12,502 x 2
## id_a id_b
## <int> <int>
## 1 1 2
## 2 3 4
## 3 5 7
## 4 6 9
## 5 7 11
## 6 8 12
## 7 14 17
## 8 15 19
## 9 17 20
## 10 18 21
## # ... with 12,492 more rows
(df_vrn_matches <-
df_a_mod %>%
select(id_a, voter_reg_num) %>%
inner_join(df_b_mod %>%
select(id_b, voter_reg_num),
by = "voter_reg_num") %>%
select(starts_with("id")) %>%
mutate(match = "match") %>%
distinct())## # A tibble: 13,495 x 3
## id_a id_b match
## <int> <int> <chr>
## 1 1 2 match
## 2 2 3 match
## 3 3 4 match
## 4 4 6 match
## 5 5 7 match
## 6 6 9 match
## 7 7 11 match
## 8 8 12 match
## 9 10 11572 match
## 10 12 15 match
## # ... with 13,485 more rows
df_matches_unexact <-
df_vrn_matches %>%
anti_join(df_exact_matches) %>%
left_join(df_a_mod, by = "id_a") %>%
left_join(df_b_mod, by = "id_b", suffix = c("_a", "_b")) %>%
mutate(pair_id = row_number())
data_path <- "data/paper/"
data_path_file <- function(file_name) glue("{data_path}{file_name}")
if(dir_exists(data_path)){
dir_create(data_path)
}
(df_matches_unexact %>%
write_rds(data_path_file("df_matches_unexact.rds")))## # A tibble: 1,271 x 22
## id_a id_b match fname_a lname_a birth_age_a gender_code_a race_code_a
## <int> <int> <chr> <chr> <chr> <int> <chr> <chr>
## 1 2 3 match TRAVIS ABERNA… 70 M W
## 2 4 6 match DARCI ABRAMS 33 F W
## 3 10 11572 match JESSICA ADAMS 34 F W
## 4 12 15 match KELSEY ADAMS 22 F W
## 5 20 22 match CONNIE ADKINS 52 F W
## 6 27 6956 match FELICIA ADKINS 51 F W
## 7 44 12560 match STACY ADKINS 35 F W
## 8 51 48 match BETHANY ADKINS… 30 F W
## 9 62 57 match KELLY AKINS 39 F W
## 10 85 82 match AMBER ALLEN 37 F W
## # ... with 1,261 more rows, and 14 more variables: voter_reg_num_a <chr>,
## # birth_year_a <dbl>, ffreq_a <dbl>, lfreq_a <dbl>, fname_b <chr>,
## # lname_b <chr>, birth_age_b <int>, gender_code_b <chr>,
## # race_code_b <chr>, voter_reg_num_b <chr>, birth_year_b <dbl>,
## # ffreq_b <dbl>, lfreq_b <dbl>, pair_id <int>
df_match_block <-
df_matches_unexact %>%
select(matches(or("id", "fname", "lname"))) %>%
mutate(fname_soundex_a = soundex(fname_a),
fname_soundex_b = soundex(fname_b),
fname_dm_a = map(fname_a, DoubleMetaphone),
fname_dm1_a = map_chr(fname_dm_a, 1),
fname_dm2_a = map_chr(fname_dm_a, 2),
fname_dm_b = map(fname_b, DoubleMetaphone),
fname_dm1_b = map_chr(fname_dm_b, 1),
fname_dm2_b = map_chr(fname_dm_b, 2),
lname_soundex_a = soundex(lname_a),
lname_soundex_b = soundex(lname_b),
lname_dm_a = map(lname_a, DoubleMetaphone),
lname_dm1_a = map_chr(lname_dm_a, 1),
lname_dm2_a = map_chr(lname_dm_a, 2),
lname_dm_b = map(lname_b, DoubleMetaphone),
lname_dm1_b = map_chr(lname_dm_b, 1),
lname_dm2_b = map_chr(lname_dm_b, 2)) %>%
select(-fname_dm_a, -fname_dm_b, -lname_dm_a, -lname_dm_b)
df_a_block <-
df_a_mod %>%
select(id_a, fname, lname) %>%
rename(fname_a = fname,
lname_a = lname) %>%
mutate(fname_soundex_a = soundex(fname_a),
fname_dm_a = map(fname_a, DoubleMetaphone),
fname_dm1_a = map_chr(fname_dm_a, 1),
fname_dm2_a = map_chr(fname_dm_a, 2),
lname_soundex_a = soundex(lname_a),
lname_dm_a = map(lname_a, DoubleMetaphone),
lname_dm1_a = map_chr(lname_dm_a, 1),
lname_dm2_a = map_chr(lname_dm_a, 2)) %>%
select(-fname_dm_a, -lname_dm_a)
df_b_block <-
df_b_mod %>%
select(id_b, fname, lname) %>%
rename(fname_b = fname,
lname_b = lname) %>%
mutate(fname_soundex_b = soundex(fname_b),
fname_dm_b = map(fname_b, DoubleMetaphone),
fname_dm1_b = map_chr(fname_dm_b, 1),
fname_dm2_b = map_chr(fname_dm_b, 2),
lname_soundex_b = soundex(lname_b),
lname_dm_b = map(lname_b, DoubleMetaphone),
lname_dm1_b = map_chr(lname_dm_b, 1),
lname_dm2_b = map_chr(lname_dm_b, 2)) %>%
select(-fname_dm_b, -lname_dm_b)
df_match_block_a <-
df_match_block %>%
select(contains("_a"))
df_match_block_b <-
df_match_block %>%
select(contains("_b"))
df_all_combos <-
bind_rows(
df_match_block_a %>%
inner_join(df_b_block, by = c("fname_soundex_a" = "fname_soundex_b")) %>%
select(id_a, id_b),
df_match_block_a %>%
inner_join(df_b_block, by = c("fname_dm1_a" = "fname_dm1_b")) %>%
select(id_a, id_b),
df_match_block_a %>%
inner_join(df_b_block, by = c("fname_dm2_a" = "fname_dm2_b")) %>%
select(id_a, id_b),
df_match_block_a %>%
inner_join(df_b_block, by = c("lname_soundex_a" = "lname_soundex_b")) %>%
select(id_a, id_b),
df_match_block_a %>%
inner_join(df_b_block, by = c("lname_dm1_a" = "lname_dm1_b")) %>%
select(id_a, id_b),
df_match_block_a %>%
inner_join(df_b_block, by = c("lname_dm2_a" = "lname_dm2_b")) %>%
select(id_a, id_b),
df_match_block_b %>%
inner_join(df_a_block, by = c("fname_soundex_b" = "fname_soundex_a")) %>%
select(id_a, id_b),
df_match_block_b %>%
inner_join(df_a_block, by = c("fname_dm1_b" = "fname_dm1_a")) %>%
select(id_a, id_b),
df_match_block_b %>%
inner_join(df_a_block, by = c("fname_dm2_b" = "fname_dm2_a")) %>%
select(id_a, id_b),
df_match_block_b %>%
inner_join(df_a_block, by = c("lname_soundex_b" = "lname_soundex_a")) %>%
select(id_a, id_b),
df_match_block_b %>%
inner_join(df_a_block, by = c("lname_dm1_b" = "lname_dm1_a")) %>%
select(id_a, id_b),
df_match_block_b %>%
inner_join(df_a_block, by = c("lname_dm2_b" = "lname_dm2_a")) %>%
select(id_a, id_b)
) %>%
distinct()
df_all_combos_nested <-
df_all_combos %>%
left_join(df_a_mod, by = "id_a") %>%
left_join(df_b_mod, by = "id_b", suffix = c("_a", "_b")) %>%
group_by(id_a, id_b) %>%
nest()
n_unique <- function(x) x %>% unique() %>% length()
rename_weight <- function(x) str_c(x, "_weight")
df_to_vector <- function(df) df %>% .[1, ] %>% unclass() %>% as.double()
weight_vector <-
df_a_mod %>%
bind_rows(df_b_mod) %>%
select(fname, gender_code, race_code, birth_year) %>%
summarise_all(n_unique) %>%
rename_all(rename_weight) %>%
mutate(sum =
fname_weight +
# lname_weight +
gender_code_weight +
race_code_weight +
birth_year_weight) %>%
mutate_all(function(x, all) x/all, all = .$sum) %>%
select(-sum) %>%
df_to_vector()
df_all_combos_nested_sim <-
df_all_combos_nested %>%
mutate(x = map_dbl(data, calculate_hamming_fields))
df_unmatches_unexact <-
df_all_combos_nested_sim %>%
semi_join(df_matches_unexact, by = "id_a") %>%
group_by(id_a) %>%
arrange(desc(x), .by_group = T) %>%
slice(1:5) %>%
ungroup() %>%
select(starts_with("id")) %>%
left_join(df_a_mod, by = "id_a") %>%
left_join(df_b_mod, by = "id_b", suffix = c("_a", "_b")) %>%
mutate(pair_id = 986398 + row_number()) %>%
mutate(match = "unmatch")
(df_unmatches_unexact %>%
write_rds(data_path_file("df_unmatches_unexact.rds")))## # A tibble: 6,347 x 22
## id_a id_b fname_a lname_a birth_age_a gender_code_a race_code_a
## <int> <int> <chr> <chr> <int> <chr> <chr>
## 1 2 3 TRAVIS ABERNATHY 70 M W
## 2 2 626 TRAVIS ABERNATHY 70 M W
## 3 2 799 TRAVIS ABERNATHY 70 M W
## 4 2 1982 TRAVIS ABERNATHY 70 M W
## 5 2 2113 TRAVIS ABERNATHY 70 M W
## 6 4 6 DARCI ABRAMS 33 F W
## 7 4 9757 DARCI ABRAMS 33 F W
## 8 4 1080 DARCI ABRAMS 33 F W
## 9 4 2265 DARCI ABRAMS 33 F W
## 10 4 2398 DARCI ABRAMS 33 F W
## # ... with 6,337 more rows, and 15 more variables: voter_reg_num_a <chr>,
## # birth_year_a <dbl>, ffreq_a <dbl>, lfreq_a <dbl>, fname_b <chr>,
## # lname_b <chr>, birth_age_b <int>, gender_code_b <chr>,
## # race_code_b <chr>, voter_reg_num_b <chr>, birth_year_b <dbl>,
## # ffreq_b <dbl>, lfreq_b <dbl>, pair_id <dbl>, match <chr>
set.seed(1)
df_pairs <-
df_unmatches_unexact %>%
bind_rows(df_matches_unexact) %>%
sample_n((nrow(.)))
(df_pairs %>%
write_rds(data_path_file("df_pairs.rds")))## # A tibble: 7,618 x 22
## id_a id_b fname_a lname_a birth_age_a gender_code_a race_code_a
## <int> <int> <chr> <chr> <int> <chr> <chr>
## 1 4730 4650 DARRELL FORBES 49 M W
## 2 7031 4254 GARRY HOPSON 43 M W
## 3 10995 4867 CARMON PETERSON 57 M W
## 4 7090 7129 LISA HOUSLEY 38 F W
## 5 3576 3621 NORWOOD CUMMINS… 72 M W
## 6 6037 6093 CHARLES HARRIS 84 M W
## 7 10527 10552 AMANDA NORTON 28 F W
## 8 12752 153 MARGARET - … RUTKOWS… 31 F W
## 9 12166 7935 MARTHA RIDDLE 74 F W
## 10 1151 283 AMANDA BENNETT 66 F W
## # ... with 7,608 more rows, and 15 more variables: voter_reg_num_a <chr>,
## # birth_year_a <dbl>, ffreq_a <dbl>, lfreq_a <dbl>, fname_b <chr>,
## # lname_b <chr>, birth_age_b <int>, gender_code_b <chr>,
## # race_code_b <chr>, voter_reg_num_b <chr>, birth_year_b <dbl>,
## # ffreq_b <dbl>, lfreq_b <dbl>, pair_id <dbl>, match <chr>
## # A tibble: 15,236 x 12
## pair_id fname lname gender_code race_code birth_year match id
## <dbl> <chr> <chr> <chr> <chr> <dbl> <chr> <int>
## 1 1 TRAVIS ABERNATHY M W 1943 match 2
## 2 1 TRAVIS ABERNATHY M W 1944 match 3
## 3 2 DARCI ABRAMS F W 1980 match 4
## 4 2 DARCI ABRAMS F W 1981 match 6
## 5 3 JESSICA ADAMS F W 1979 match 10
## 6 3 JESSICA RAHN F W 1979 match 11572
## 7 4 KELSEY ADAMS F W 1991 match 12
## 8 4 KELSEY ADAMS F W 1992 match 15
## 9 5 CONNIE ADKINS F W 1961 match 20
## 10 5 CONNIE ADKINS F W 1962 match 22
## # ... with 15,226 more rows, and 4 more variables: birthge_a <int>,
## # voter_reg_num <chr>, ffreq <dbl>, lfreq <dbl>
The above pipeline was wrapped into a function, so that database A can be made noisy and the pair generation can be repeated.
df_error_table <-
read_csv("R/paper/error_table.csv")
set.seed(13)
df_messed_collection <-
tibble(error_rate = seq(0, 40, 10)) %>%
mutate(df_a_mod = map(error_rate, function(e){
# browser()
df_a_mod %>%
prep_data() %>%
mess_data(df_error_table %>%
mutate(amount = amount*e/10)) %>%
pluck("df_secondary") %>%
select(-file, -id) %>%
mutate_if(is.character, str_to_upper) %>%
mutate(birth_year = as.integer(birth_year)) %>%
select(-ffreq, -lfreq) %>%
add_count(fname) %>%
rename(ffreq = n) %>%
add_count(lname) %>%
rename(lfreq = n) %>%
mutate(ffreq = scale(ffreq),
lfreq = scale(lfreq))
}))
df_messed_collection <-
df_messed_collection %>%
mutate(df_pairs = map2(df_a_mod, error_rate, function(df_a_mod, e) {
message("________________________")
message(glue("Error rate: {e}%"))
generate_pairs(df_a_mod = df_a_mod,
df_b_mod = df_b_mod,
data_pref =
e %>%
str_pad(width = 2, side = "left", pad = "0") %>%
str_c("err_", .))
}))
df_messed_collection %>%
write_rds("data/paper/error_data/df_messed_collection.rds")
df_messed_collection
## # A tibble: 5 x 3
## error_rate df_a_mod df_pairs
## <dbl> <list> <list>
## 1 0 <tibble [16,271 × 10]> <tibble [7,618 × 22]>
## 2 0.1 <tibble [16,271 × 10]> <tibble [14,391 × 22]>
## 3 0.2 <tibble [16,271 × 10]> <tibble [18,932 × 22]>
## 4 0.3 <tibble [16,271 × 10]> <tibble [23,588 × 22]>
## 5 0.4 <tibble [16,271 × 10]> <tibble [27,735 × 22]>
Using a function called add_feature_vector to generate features.
df_messed_collection <-
df_messed_collection %>%
mutate(df_feature = map(df_pairs, function(df_pair){
df_pair %>%
add_feature_vector() %>%
select(starts_with("metric"), match) %>%
mutate(match = match %>% factor(levels = c("unmatch", "match"))) %>%
as.data.frame()
}))
df_messed_collection %>%
write_rds("data/paper/error_data/df_feature_collection.rds")
df_messed_collection
## # A tibble: 5 x 4
## error_rate df_a_mod df_pairs df_feature
## <dbl> <list> <list> <list>
## 1 0 <tibble [16,271 × 10]> <tibble [7,618 × 22]> <data.frame [7…
## 2 0.1 <tibble [16,271 × 10]> <tibble [14,391 × 22]> <data.frame [1…
## 3 0.2 <tibble [16,271 × 10]> <tibble [18,932 × 22]> <data.frame [1…
## 4 0.3 <tibble [16,271 × 10]> <tibble [23,588 × 22]> <data.frame [2…
## 5 0.4 <tibble [16,271 × 10]> <tibble [27,735 × 22]> <data.frame [2…
One df_feature dataframe looks like these.
## # A tibble: 7,618 x 16
## metric_gender_co… metric_gender_co… metric_gender_cod… metric_race_cod…
## <int> <int> <lgl> <lgl>
## 1 0 2 TRUE TRUE
## 2 0 2 TRUE TRUE
## 3 0 2 TRUE TRUE
## 4 2 0 TRUE TRUE
## 5 0 2 TRUE TRUE
## 6 0 2 TRUE TRUE
## 7 2 0 TRUE TRUE
## 8 2 0 TRUE TRUE
## 9 2 0 TRUE TRUE
## 10 2 0 TRUE TRUE
## # ... with 7,608 more rows, and 12 more variables:
## # metric_race_code_ww_bb <lgl>, metric_year_diff_abs <int>,
## # metric_age <dbl>, metric_ffreq_mean <dbl>, metric_lfreq_mean <dbl>,
## # metric_fname_dl <dbl>, metric_fname_jw <dbl>,
## # metric_fname_soundex <dbl>, metric_lname_dl <dbl>,
## # metric_lname_jw <dbl>, metric_lname_soundex <dbl>, match <fct>
sample_strat <- function(df, n = 5000){
match <- df$match
row_num <- 1:nrow(df)
m = ceiling(n/6)
u = n - m
indices_m = sample(row_num[match == "match"], m)
indices_u = sample(row_num[match == "unmatch"], u)
sample(c(indices_u, indices_m), n)
}
df_all <-
df_messed_collection$df_feature[[1]]
set.seed(23)
df_messed_collection <-
df_messed_collection %>%
mutate(train_indices = map(df_pairs, sample_strat, n = 5000),
test_indices = map2(df_feature, train_indices, function(df, t){
(1:nrow(df))[-t]
}),
df_train = map2(df_feature, train_indices, function(df, t){
df[t, ]
}),
df_test =
pmap(list(df_pairs, df_feature, test_indices),
function(pairs, df, v){
bind_cols(pairs, df) %>%
as.data.frame() %>%
.[v, ]
}))
train_control <-
trainControl(method = "cv",
number = 10,
verboseIter = FALSE,
savePredictions = TRUE,
classProbs = TRUE)
df_messed_collection <-
df_messed_collection %>%
mutate(
model_rf = map(df_train, function(df_tr){
# browser()
set.seed(3)
train(match ~ .,
df_tr,
trControl = train_control,
tuneGrid = expand.grid(.mtry = seq(1, 6, 1)),
importance = TRUE,
keep.forest= TRUE,
ntree = 350,
method = "rf")
}),
model_svm_radial = map(df_train, function(df_tr){
# browser()
set.seed(3)
train(match ~ .,
df_tr,
trControl = train_control,
method = "svmRadial")
}),
model_svm_linear = map(df_train, function(df_tr){
# browser()
set.seed(3)
train(match ~ .,
df_tr,
trControl = train_control,
method = "svmLinear")
})
)
df_messed_collection %>%
write_rds("data/paper/error_data/df_messed_collection_models.rds")
df_messed_collection
## # A tibble: 5 x 11
## error_rate df_a_mod df_pairs df_feature train_indices test_indices
## <dbl> <list> <list> <list> <list> <list>
## 1 0 <tibble [… <tibble [… <data.frame… <int [5,000]> <int [2,618…
## 2 0.1 <tibble [… <tibble [… <data.frame… <int [5,000]> <int [9,391…
## 3 0.2 <tibble [… <tibble [… <data.frame… <int [5,000]> <int [13,93…
## 4 0.3 <tibble [… <tibble [… <data.frame… <int [5,000]> <int [18,58…
## 5 0.4 <tibble [… <tibble [… <data.frame… <int [5,000]> <int [22,73…
## # ... with 5 more variables: df_train <list>, df_test <list>,
## # model_rf <list>, model_svm_radial <list>, model_svm_linear <list>
df_messed_collection <-
df_messed_collection %>%
mutate(
results = map2(model_obj, df_test, function(model, df){
df %>%
mutate(pair_id = 1:nrow(.)) %>%
evaluate_model(model, df_test = ., plot_roc = F)
}),
metrics = map(results, ~.x$metrics$df_metric_table),
confusion_matrix = map(results, ~.x$metrics$confusion_matrix),
roc_curve = map(results, ~.x$metrics$roc_curve),
pred_confidence = map(results, ~.x$confidence)
)
df_model_metrics <-
df_messed_collection %>%
select(model, error_rate, metrics) %>%
unnest(metrics)
plot_roc_all(df_messed_collection %>% filter(model == "svm_radial"),
model_col = error_rate)df_model_metrics %>%
filter(metric %in% c("accuracy", "precision", "recall", "f1","auc")) %>%
ggplot(aes(error_rate, value, group = metric, col = metric)) +
geom_line() +
geom_point() +
scale_y_continuous(breaks = seq(0, 1, 0.2),
limits = c(0, 1),
minor_breaks = NULL)+
theme_light()df_model_metrics %>%
filter(metric %in% c("accuracy", "precision", "recall", "f1","auc")) %>%
ggplot(aes(metric, value, fill = model)) +
geom_col(position = "dodge") +
facet_grid(error_rate~model) +
coord_flip()